Lab 10 Objectives:
library(tidyverse) # The tidyverse!
## ── Attaching packages ──────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(corrplot) # For correlation matrices
## corrplot 0.84 loaded
library(janitor) # For cleaning up column names
library(lubridate) # For dealing with dates & times
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(gganimate) # For adding animation to ggplot graphics
library(ggimage) # For updating graph components with images
Compiled World Bank data, accessed from: https://www.kaggle.com/zanderventer/environmental-variables-for-world-countries#World_countries_env_vars.csv
env_var <- read_csv("world_env_vars.csv") %>%
na.omit
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character()
## )
## See spec(...) for full column specifications.
cor_df <- cor(env_var[2:28])
corrplot(cor_df,
type = "upper",
method = "ellipse",
tl.col = "black",
tl.cex = .5) #only shows "upper" half of symmetrical plot, directionality showed by "ellipse" shape, text label color is "black", text label size is ".5"
Use the ‘glm’ function for fitting generalized linear models (the logit - log odds of survival, in our case, will be linearly related to Sex and Age. So we expect the final model to look something like this:
\[Log Odds (Survival) = \beta_0 + \beta_1(Age) + \beta_2(Sex)\]
We’ll use ‘family = binomial’ to run binomial logistic regression…otherwise, this looks very similar to other types of regression we’ve already done.
DonnerTable <- read_csv("DonnerTable.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## Age = col_integer(),
## Outcome = col_integer(),
## Sex = col_character(),
## Family.name = col_character(),
## Status = col_character()
## )
1 = survival 0 = death
donner_blr <- glm(Outcome ~ Sex + Age, family = "binomial", data = DonnerTable)
summary(donner_blr)
##
## Call:
## glm(formula = Outcome ~ Sex + Age, family = "binomial", data = DonnerTable)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8828 -1.0383 0.6511 1.0261 1.7386
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.62180 0.50279 3.226 0.00126 **
## SexMale -1.06798 0.48229 -2.214 0.02680 *
## Age -0.03561 0.01525 -2.336 0.01952 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 120.86 on 87 degrees of freedom
## Residual deviance: 108.87 on 85 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 114.87
##
## Number of Fisher Scoring iterations: 4
# 1. Create a data frame with variables Sex and Age, containing data "Female" and 5, respectively:
f_5 <- data.frame(Sex = "Female", Age = 5)
# 2. Find the log odds of survival for the new data (5 year old female) using predict() function with type = "link":
f_5_logodds <- predict(donner_blr, newdata = f_5, type = "link")
# 3. Exponentiate the log odds to find ODDS of survival for a 5 year old female:
exp(f_5_logodds)
## 1
## 4.23666
# Ask: Could we manually find the probability of survival for a 5-year old female? recall: p/(1-p) = ODDS --> p = 80%
# 4. Actually, let's just use type = "response" in the predict function, which converts to a probability for us:
f_5_prob <- predict(donner_blr, newdata = f_5, type = "response")
f_5_prob
## 1
## 0.8090386
# Similarly:
m_25 <- data.frame(Sex = "Male", Age = 25) # Make a new data frame
m_25_prob <- predict(donner_blr, newdata = m_25, type = "response") # Find probability of survival
m_25_prob
## 1
## 0.4167068
seq_age <- rep(seq(from = 0, to = 100), 2) # Create a sequence from 0 to 100, twice (one will be "Male" and one will be "Female")
f_101 <- rep("Female", 101) # Repeat 'Female' 101 times (to match years data)
m_101 <- rep("Male", 101) # Repeat 'Male' 101 times
mf_101 <- c(f_101, m_101) # Combine them into a single vector
# Combine the age and sex sequences into a single data frame - that will be the new data that we have our model make predictions for
donner_newdata <- data.frame(seq_age, mf_101) # MUST make column names match variables in the model!
colnames(donner_newdata) <- c("Age","Sex")
# Find probabilities using predict (with type = "response"). Include SE.
predicted_probs <- predict(donner_blr, newdata = donner_newdata, type = "response", se.fit = TRUE)
# Coerce outcome into data frame.
graph_data <- data.frame(donner_newdata, predicted_probs$fit, predicted_probs$se.fit)
colnames(graph_data) <- c("Age", "Sex", "Probability", "SE")
ggplot(graph_data, aes(x = Age, y = Probability)) +
geom_line(aes(color = Sex)) +
geom_ribbon(aes(ymin = Probability - SE, ymax = Probability + SE, fill = Sex, alpha = .5))
Our graph corroborates our results from our model: females have higher probability of survival than males, and younger ages have higher probability tha older ages.
si_full <- list.files(pattern = "solar_irradiation_*") %>%
map_df(~read_csv(.)) %>%
clean_names()
## Parsed with column specification:
## cols(
## .default = col_integer(),
## `YYYY-MM-DD` = col_character(),
## `HH:MM (LST)` = col_time(format = ""),
## `Zenith (deg)` = col_double(),
## `Azimuth (deg)` = col_double(),
## `Precip Wat (cm)` = col_double(),
## `AOD (unitless)` = col_double(),
## `AOD RAN (unitless)` = col_double(),
## `Ozone (cm)` = col_double(),
## `Albedo (unitless)` = col_double(),
## site = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_integer(),
## `YYYY-MM-DD` = col_character(),
## `HH:MM (LST)` = col_time(format = ""),
## `Zenith (deg)` = col_double(),
## `Azimuth (deg)` = col_double(),
## `Precip Wat (cm)` = col_double(),
## `AOD (unitless)` = col_double(),
## `AOD RAN (unitless)` = col_double(),
## `Ozone (cm)` = col_double(),
## `Albedo (unitless)` = col_double(),
## site = col_character()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_integer(),
## `YYYY-MM-DD` = col_character(),
## `HH:MM (LST)` = col_time(format = ""),
## `Zenith (deg)` = col_double(),
## `Azimuth (deg)` = col_double(),
## `Precip Wat (cm)` = col_double(),
## `AOD (unitless)` = col_double(),
## `AOD RAN (unitless)` = col_double(),
## `Ozone (cm)` = col_double(),
## `Albedo (unitless)` = col_double(),
## site = col_character()
## )
## See spec(...) for full column specifications.
solar_tidy <- si_full %>%
rename(
sol_rad = etr_wh_m_2,
date = yyyy_mm_dd,
time = hh_mm_lst
) %>%
filter(time != "NA") %>%
mutate(site = fct_relevel(site, "Hawaii", "Santa Barbara", "Alaska"))
solar_tidy$date <- mdy(solar_tidy$date)
solar_tidy$time <-hms(solar_tidy$time)
solar_gg <- ggplot(solar_tidy, aes(x = date, y = time)) +
geom_tile(aes(fill = sol_rad)) +
scale_fill_gradientn(colors = c("royalblue2", "mediumorchid1", "orange", "yellow")) +
scale_y_time() +
facet_grid(site ~ .)
solar_gg
aq_df <-read_csv("aq_wb.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## country = col_character(),
## code = col_character(),
## indicator = col_character(),
## ind_code = col_character(),
## `1960` = col_integer(),
## `1961` = col_integer(),
## `1962` = col_integer(),
## `1963` = col_integer(),
## `1964` = col_integer(),
## `1965` = col_integer(),
## `1966` = col_integer(),
## `1967` = col_integer(),
## `1968` = col_integer(),
## `1969` = col_integer(),
## `1970` = col_integer(),
## `1971` = col_integer(),
## `1972` = col_integer(),
## `1973` = col_integer(),
## `1974` = col_integer(),
## `1975` = col_integer()
## # ... with 9 more columns
## )
## See spec(...) for full column specifications.
aq_tidy <- aq_df %>%
filter(country == "Brazil" |
country == "Chile" |
country == "Ecuador" |
country == "United States") %>%
gather(year, aq_prod, `1960`:`2014`) %>%
filter(year >= 1990) %>%
mutate(aq_mil = aq_prod/1000000) %>%
select(country, year, aq_mil)
#gather() creates a new column called year, and a new column called aq_prod, based on data from columns 1960-2014 (i.e., converts data from wide format to long format) (n.b., spread() converts data from long format to wide format)
fish <-"fish.png"
aq_plot <- ggplot(aq_tidy, aes(x = as.numeric(year), y = aq_mil, group = country)) +
geom_line(aes(color = country)) +
geom_point(aes(color = country)) +
geom_image(aes(image = fish)) +
geom_text(aes(label = country, color = country), position = position_nudge(y = .04, x = 1), size = 5) +
transition_reveal(country, as.numeric(year))
## Warning: Ignoring unknown parameters: image_colour
aq_plot